In [1]:
# Make the project root (one level above `research/`) importable so the
# notebook can use project-local modules such as `utils` and `configs`.
import os
import sys
# print(f"Current Working Directory --> {os.getcwd()}")
#Add one directory above research
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..")) # Get the parent directory
sys.path.append(parent_dir)  # NOTE(review): appended on every execution; duplicates accumulate if the cell is re-run
current_working_dir = %pwd  # IPython line magic: current working directory as a string
print(f"Parent Dir >>> {parent_dir}")
print(f"Current Working Dir >>> {current_working_dir}")
# from configs import cfgs # Absolute import
Parent Dir >>> C:\Users\maz\dev\Projects_\alzheimer Current Working Dir >>> C:\Users\maz\dev\Projects_\alzheimer\research
In [2]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)
Imports¶
In [3]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.pipeline import Pipeline
# Custom Functions
from utils import *
Paths¶
In [4]:
def _rule(width):
    """Print a horizontal rule of `width` asterisks (section divider)."""
    print("*" * width)

# Build Path objects for the dataset directory and the files we will load.
# dataset_dir = cfgs["DATASET_DIR"]
dataset_dir = "..//dataset//modified"
dataset_path = Path(dataset_dir)

print("Dataset Path")
_rule(12)
print(f"Dataset: {dataset_path}")
print("\n")

# Discover every CSV file directly under the dataset directory.
files = [entry for entry in dataset_path.glob("*.csv")]
print("Files in Dataset Dir:")
_rule(21)
for csv_file in files:
    print(csv_file.name)  # file name only, no directory prefix

# Compose the individual dataset file paths with the `/` operator.
# path_metadata = dataset_path / "MetaData.xlsx"
path_train = dataset_path / "train.csv"
path_test = dataset_path / "test.csv"
path_train_cleaned = dataset_path / "train_v01.csv"

print("\n")
print(f"Train File Path --> {path_train}")
print(f"Train File Path | Cleaner Version --> {path_train_cleaned}")
# print("\n")
print(f"Test File Path --> {path_test}")
Dataset Path ************ Dataset: ..\dataset\modified Files in Dataset Dir: ********************* test.csv test_features.csv test_labels.csv train.csv train_features.csv train_labels.csv train_without_featEng.csv train_with_featEng.csv Train File Path --> ..\dataset\modified\train.csv Train File Path | Cleaner Version --> ..\dataset\modified\train_v01.csv Test File Path --> ..\dataset\modified\test.csv
Data Loading¶
In [5]:
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)
In [6]:
# Load the training data; fall back to a clear message if the file is absent.
try:
    dfTrain = pd.read_csv(path_train, encoding='utf8')
    display(dfTrain.head(2))
    print(dfTrain.shape)
except FileNotFoundError:
    # Report the actual path searched (the original message hardcoded
    # 'train.csv' and would mislead if `path_train` ever changed).
    print(f"Error: '{path_train}' not found. Please ensure the file is in the correct location.")
    dfTrain = None  # downstream cells must check for None before use
| UID | Year | composite_score | Age_03 | Urban_03 | Married_03 | Marriages_03 | Education_03 | Num_Living_Child_03 | Migration_03 | ... | Meet_FnF_12 | SocialActivities_12 | AttendReligiousServices_12 | a16a_12 | YrsLivedInUSA_12 | a22_12 | a33b_12 | SpeaksEnglish_12 | HousingEnvironment_12 | PredictionYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | aard | 2021 | 104 | 50-59 | Urban | Widowed | 1.0 | 7-9 Years | 1 or 2 | 0.0 | ... | Once a week | Never | 1.0 | NaN | NaN | NaN | NaN | 0.0 | Concrete | 9 |
| 1 | abme | 2021 | 106 | 50-59 | Rural | Married or In Civil Union | 1.0 | 1-5 Years | 5 or 6 | 0.0 | ... | Never | Never | 0.0 | NaN | NaN | NaN | NaN | 0.0 | Concrete | 9 |
2 rows × 185 columns
(2889, 185)
Shape¶
In [7]:
# Single f-string (the original mixed an f-string with str() concatenation);
# output is byte-identical: "... <rows> rows <cols> columns".
print(f"In the training data we have {dfTrain.shape[0]} rows {dfTrain.shape[1]} columns")
In the training data we have 2889 rows 185 columns
Examine data¶
In [8]:
#dfTrain.info(verbose=True, show_counts=True)
Features Data Type Conversion¶
- We will convert the `Year` data type to `category` for memory efficiency
- We will convert columns with `object` type to `category` if those have low cardinality
- Also there are some columns whose inferred data type is `float64` but which are actually `boolean`. We will convert them to `Categorical`, since Boolean variables are categorical by nature (True, False, NA)

- Convert to Category¶
In [9]:
# Survey the dtype landscape before any conversion work.
all_dtypes = dfTrain.dtypes           # per-column dtypes (pandas Series)
unique_dtypes = all_dtypes.unique()   # distinct dtypes present (ndarray)

print("\nAll dtypes (Series):")
print(all_dtypes)
print("-" * 30)
print("\nUnique dtypes (NumPy array of dtype objects):")
print(unique_dtypes)
print("-" * 30)
All dtypes (Series):
UID object
Year int64
composite_score int64
Age_03 object
Urban_03 object
...
a22_12 object
a33b_12 object
SpeaksEnglish_12 float64
HousingEnvironment_12 object
PredictionYear int64
Length: 185, dtype: object
------------------------------
Unique dtypes (NumPy array of dtype objects):
[dtype('O') dtype('int64') dtype('float64')]
------------------------------
In [10]:
# print("*" * 44)
# print(f"Converting `object` data types to `Category`")
# print("*" * 44)
# # Convert to 'Year ' Column to category
# df['Year'] = df['Year'].astype('category')
# # Convert `Object` data type to Category. Since reading from .csv pandas don't infer them automatically as `Category`
# df, converted_columns_train = identify_and_convert_object_to_category(df, threshold_ratio=0.1, max_unique=50)
# print("\n--- After Conversion ---")
# print("Examine Converted DataFrame dtypes:")
# print(df.dtypes)
# print("\nColumns converted to 'category':", converted_columns_train, "\n")
# # Check the categories in a converted column
# if 'Age_Group' in converted_columns_train:
# print(f"\nCategories in 'Age_Group' column: {df['Age_Group'].cat.categories.tolist()}")
# if 'Urban_Status' in converted_columns_train:
# print(f"Categories in 'Urban_Status' column: {df['Urban_Status'].cat.categories.tolist()}")
# print("*" * 65)
# print(f"Converting `Float` data types that have `0` and `1` to `Category`")
# print("*" * 65)
# # Convert Boolean to Category
# df, cat_cols = convert_float_to_bool(df)
# df, bool_cols = convert_boolean_to_category(df)
In [11]:
# --- Define the Pipeline ---
# Note: The order matters if transformations depend on previous ones,
# though in this case, they mostly operate on distinct initial dtypes.
# Assemble the dtype-conversion steps into one sklearn Pipeline.
# Order matters only if a step depends on a previous step's output dtypes;
# here each transformer targets a distinct set of initial dtypes.
conversion_steps = [
    ('specific_categorizer', SpecificColumnCategorizer(columns_to_categorize=['Year'])),
    ('object_to_category', ObjectToCategoryTransformer(threshold_ratio=0.1, max_unique=50)),  # threshold tuned for this sample
    ('float_to_category', FloatToCategoryTransformer()),
    # ('bool_to_category', BooleanToCategoryTransformer())
]

print("*" * 80)
data_type_conversion_pipeline = Pipeline(conversion_steps)
print(f"PipeLine | Data Types Conversion: {data_type_conversion_pipeline}")
print("*" * 80)
********************************************************************************
PipeLine | Data Types Conversion: Pipeline(steps=[('specific_categorizer',
SpecificColumnCategorizer(columns_to_categorize=['Year'])),
('object_to_category', ObjectToCategoryTransformer()),
('float_to_category', FloatToCategoryTransformer())])
********************************************************************************
In [12]:
%%capture
# --- Apply the Pipeline ---
print("*" * 49)
print("--- Applying Pipeline | Data Types Conversion ---")
print("*" * 49)
df = data_type_conversion_pipeline.fit_transform(dfTrain)
In [13]:
# Plain string literal — the original used an f-string with no placeholders.
print("--- PipeLine Completed ---")
--- PipeLine Completed ---
In [14]:
def _banner(text, char="*"):
    """Echo `text` framed above and below by a rule matching its width."""
    rule = char * len(text)
    print(rule)
    print(text)
    print(rule)

# Re-inspect dtypes now that the conversion pipeline has run.
_banner("After Conversion of Data Types")
all_dtypes = df.dtypes
_banner("All dtypes (Series):", char="-")
print(f"All Data Types -> {all_dtypes}")
unique_dtypes = df.dtypes.unique()
_banner("Unique dtypes (NumPy array of dtype objects):", char="-")
print(f"Unique Data Types -> {unique_dtypes}")
******************************
After Conversion of Data Types
******************************
--------------------
All dtypes (Series):
--------------------
All Data Types -> UID object
Year category
composite_score int64
Age_03 category
Urban_03 category
...
a22_12 category
a33b_12 category
SpeaksEnglish_12 category
HousingEnvironment_12 category
PredictionYear int64
Length: 185, dtype: object
---------------------------------------------
Unique dtypes (NumPy array of dtype objects):
---------------------------------------------
Unique Data Types -> [dtype('O')
CategoricalDtype(categories=[2016, 2021], ordered=False, categories_dtype=int64)
dtype('int64')
CategoricalDtype(categories=['49 or younger', '50-59', '60-69', '70-79', '80+'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Rural', 'Urban'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Married or In Civil Union', 'Separated or Divorced',
'Single', 'Widowed'],
, ordered=False, categories_dtype=object)
dtype('float64')
CategoricalDtype(categories=['1-5 Years', '10+ Years', '6 Years', '7-9 Years',
'No education'],
, ordered=False, categories_dtype=object)
CategoricalDtype(categories=['1 or 2', '3 or 4', '5 or 6', '7+', 'No children'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=[0.0, 1.0], ordered=False, categories_dtype=float64)
CategoricalDtype(categories=['Excellent', 'Fair', 'Good', 'Poor', 'Very Good'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Morbidly Obese', 'Normal Weight', 'Obese', 'Over Weight',
'Under Weight'],
, ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Both', 'Patient', 'Spouse'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Currently Looking for Work', 'Currently Working',
'Dedicated to Household Chores',
'Retired, Incapacitated, or Does not Work'],
, ordered=False, categories_dtype=object)
CategoricalDtype(categories=['50-59', '60-69', '70-79', '80+'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Agrees', 'Disagrees', 'Neither Agrees nor Disagrees'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Man', 'Woman'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['More than Primary', 'Primary', 'Some Primary'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Administrative Support Staff',
'Artisans and Workers in Production, Repair, Maintenance',
'Asst/Laborers etc in Ind. Production, Repair, Maintenance',
'Department Heads/Coordinators/Supervisors in Admin and Service Activities',
'Domestic Workers',
'Drivers and Asst Drivers of Mobile Machinery and Transport Vehicles',
'Educators', 'Merchants and Sales Representatives',
'Officials and Directors Public, Private, and Social Sectors',
'Operators of Fixed Machinery and Equipment for Ind. Production',
'Other Workers', 'Professionals',
'Safety and Security Personnel', 'Technicians',
'Traveling Salespeople and Traveling Salespeople of Services',
'Workers in Agriculture, Livestock, Forestry, and Fishing',
'Workers in Art, Shows, and Sports',
'Workers in the Service Industry'],
, ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Family', 'Health', 'Laid off', 'Other', 'Retired'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Not Important', 'Somewhat Important', 'Very Important'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Administrative Support Staff',
'Artisans and Workers in Production, Repair, Maintenance',
'Asst/Laborers etc in Ind. Production, Repair, Maintenance',
'Bosses/Supervisors etc in Artistic, Ind. Production, Repair, Maintenance Activities',
'Department Heads/Coordinators/Supervisors in Admin and Service Activities',
'Domestic Workers',
'Drivers and Asst Drivers of Mobile Machinery and Transport Vehicles',
'Educators', 'Merchants and Sales Representatives',
'Officials and Directors Public, Private, and Social Sectors',
'Operators of Fixed Machinery and Equipment for Ind. Production',
'Professionals', 'Safety and Security Personnel',
'Technicians',
'Traveling Salespeople and Traveling Salespeople of Services',
'Workers in Agriculture, Livestock, Forestry, and Fishing',
'Workers in Art, Shows, and Sports',
'Workers in the Service Industry'],
, ordered=False, categories_dtype=object)
CategoricalDtype(categories=['2 or 3 Times a Week', '4 or Moretimes a Week',
'Almost Everyday', 'Almost Never, Sporadic',
'Every other Week', 'Never', 'Once a Month', 'Once a Week',
'Once a week'],
, ordered=False, categories_dtype=object)
CategoricalDtype(categories=['2 or 3 Times a Week', '4 or Moretimes a week',
'Almost Everyday', 'Almost Never, Sporadic',
'Every other Week', 'Never', 'Once a Month', 'Once a week'],
, ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Agriculture/ Animal breeding 01',
'Childcare or domestic work 04',
'Construction/ Manufacturing/ Mining 02', 'Did not work 08',
'Gardening or maintenance 03', 'Other 07',
'Restaurant/ Store/ Hotel 05'],
, ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Citizen', 'Neither', 'Permanent Resident'], ordered=False, categories_dtype=object)
CategoricalDtype(categories=['Concrete', 'Mud', 'Wood, Mosaic, or other Covering'], ordered=False, categories_dtype=object)]
In [15]:
# Full column-by-column summary (non-null counts + dtypes) after conversion.
df.info(verbose=True, show_counts=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2889 entries, 0 to 2888 Data columns (total 185 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UID 2889 non-null object 1 Year 2889 non-null category 2 composite_score 2889 non-null int64 3 Age_03 2887 non-null category 4 Urban_03 2889 non-null category 5 Married_03 2889 non-null category 6 Marriages_03 2861 non-null float64 7 Education_03 2875 non-null category 8 Num_Living_Child_03 2872 non-null category 9 Migration_03 2887 non-null category 10 GlobalHealth_03 2721 non-null category 11 ADL_Dress_03 2722 non-null category 12 ADL_Walk_03 2880 non-null category 13 ADL_Bath_03 2880 non-null category 14 ADL_Eat_03 2878 non-null category 15 ADL_Bed_03 2880 non-null category 16 ADL_Toilet_03 2880 non-null category 17 Num_ADL_03 2878 non-null float64 18 IADL_Money_03 2722 non-null category 19 IADL_Meds_03 2722 non-null category 20 IADL_Shop_03 2722 non-null category 21 IADL_Meals_03 2722 non-null category 22 Num_IADL_03 2722 non-null float64 23 Depressed_03 2718 non-null category 24 Hard_03 2721 non-null category 25 Restless_03 2721 non-null category 26 Happy_03 2714 non-null category 27 Lonely_03 2721 non-null category 28 Enjoy_03 2714 non-null category 29 Sad_03 2722 non-null category 30 Tired_03 2722 non-null category 31 Energetic_03 2713 non-null category 32 Num_CES-D_Symptoms_03 2715 non-null float64 33 CES-D_Symptoms_03 2715 non-null category 34 Hypertension_03 2889 non-null category 35 Diabetes_03 2889 non-null category 36 Respiratory_Illness_03 2889 non-null category 37 Arthritis_03 2889 non-null category 38 HeartAttack_03 2887 non-null category 39 Stroke_03 2889 non-null category 40 Cancer_03 2889 non-null category 41 Num_Illnesses_03 2887 non-null float64 42 BMI_03 2041 non-null category 43 Exercise_03 2722 non-null category 44 Alcohol_03 2887 non-null category 45 Tobacco_03 2888 non-null category 46 Test_Cholestrol_03 2715 non-null category 47 Test_Tuber_03 2708 non-null category 48 
Test_Diabetes_03 2720 non-null category 49 Test_BloodPress_03 2720 non-null category 50 Hospitalized_03 2889 non-null category 51 Visit_Dr_03 2885 non-null category 52 OutPatient_03 2889 non-null category 53 Visit_Dental_03 2889 non-null category 54 imss_03 2889 non-null category 55 issste_03 2889 non-null category 56 pem_def_mar_03 2889 non-null category 57 insur_private_03 2889 non-null category 58 insur_other_03 2889 non-null category 59 Insured_03 2889 non-null category 60 FamilyDecisions_03 2046 non-null category 61 Employment_03 2886 non-null category 62 Age_12 2782 non-null category 63 Urban_12 2782 non-null category 64 Married_12 2782 non-null category 65 Marriages_12 2749 non-null float64 66 Education_12 2768 non-null category 67 Num_Living_Child_12 2756 non-null category 68 Migration_12 2782 non-null category 69 GlobalHealth_12 2682 non-null category 70 ADL_Dress_12 2682 non-null category 71 ADL_Walk_12 2773 non-null category 72 ADL_Bath_12 2766 non-null category 73 ADL_Eat_12 2771 non-null category 74 ADL_Bed_12 2772 non-null category 75 ADL_Toilet_12 2772 non-null category 76 Num_ADL_12 2763 non-null float64 77 IADL_Money_12 2681 non-null category 78 IADL_Meds_12 2681 non-null category 79 IADL_Shop_12 2681 non-null category 80 IADL_Meals_12 2681 non-null category 81 Num_IADL_12 2680 non-null float64 82 Depressed_12 2676 non-null category 83 Hard_12 2681 non-null category 84 Restless_12 2680 non-null category 85 Happy_12 2670 non-null category 86 Lonely_12 2679 non-null category 87 Enjoy_12 2676 non-null category 88 Sad_12 2677 non-null category 89 Tired_12 2680 non-null category 90 Energetic_12 2679 non-null category 91 Num_CES-D_Symptoms_12 2660 non-null float64 92 CES-D_Symptoms_12 2660 non-null category 93 Hypertension_12 2777 non-null category 94 Diabetes_12 2777 non-null category 95 Respiratory_Illness_12 2779 non-null category 96 Arthritis_12 2776 non-null category 97 HeartAttack_12 2778 non-null category 98 Stroke_12 2780 non-null category 99 
Cancer_12 2777 non-null category 100 Num_Illnesses_12 2762 non-null float64 101 BMI_12 2470 non-null category 102 Exercise_12 2682 non-null category 103 Alcohol_12 2782 non-null category 104 Tobacco_12 2782 non-null category 105 Test_Cholestrol_12 2672 non-null category 106 Test_Tuber_12 2639 non-null category 107 Test_Diabetes_12 2678 non-null category 108 Test_BloodPress_12 2680 non-null category 109 Hospitalized_12 2782 non-null category 110 Visit_Dr_12 2775 non-null category 111 OutPatient_12 2780 non-null category 112 Visit_Dental_12 2775 non-null category 113 imss_12 2779 non-null category 114 issste_12 2781 non-null category 115 pem_def_mar_12 2782 non-null category 116 insur_private_12 2780 non-null category 117 insur_other_12 2780 non-null category 118 Insured_12 2782 non-null category 119 FamilyDecisions_12 1739 non-null category 120 Employment_12 2782 non-null category 121 Vax_Flu_12 2673 non-null category 122 Vax_Pneu_12 2612 non-null category 123 seg_pop_12 2782 non-null category 124 CareAdult_12 2682 non-null category 125 CareChild_12 2680 non-null category 126 Volunteer_12 2680 non-null category 127 AttendsClass_12 2681 non-null category 128 AttendsClub_12 2682 non-null category 129 Reads_12 2672 non-null category 130 Games_12 2680 non-null category 131 TableGames_12 2680 non-null category 132 UseElectronicDevices_12 2681 non-null category 133 HouseMaintenance_12 2682 non-null category 134 TV_12 2682 non-null category 135 Sewing_12 2682 non-null category 136 Satement_Ideal_12 2626 non-null category 137 Satement_Excel_12 2659 non-null category 138 Satement_Fine_12 2676 non-null category 139 COSAS_IMP_12 2676 non-null category 140 WouldntChange_12 2653 non-null category 141 Memory_12 2658 non-null category 142 Gender 2889 non-null category 143 EducationMother 1172 non-null category 144 EducationFather 1344 non-null category 145 SpouseGender_03 2260 non-null category 146 JobHrsWeekly_03 1451 non-null float64 147 JobCatLongest_03 400 non-null category 
148 YrJobEnded_03 456 non-null float64 149 ReasonJobEnded_03 471 non-null category 150 Earnings_03 2886 non-null float64 151 SpouseEarnings_03 2186 non-null float64 152 hincome_03 2859 non-null float64 153 hinc_business_03 2888 non-null float64 154 hinc_rent_03 2888 non-null float64 155 hinc_assets_03 2888 non-null float64 156 hinc_cap_03 2888 non-null float64 157 Pension_03 2886 non-null float64 158 SpousePension_03 2186 non-null float64 159 Religon_Imp_03 2687 non-null category 160 SpouseGender_12 1778 non-null category 161 JobHrsWeekly_12 909 non-null float64 162 JobCatLongest_12 1161 non-null category 163 YrJobEnded_12 288 non-null float64 164 ReasonJobEnded_12 298 non-null category 165 Earnings_12 2782 non-null float64 166 SpouseEarnings_12 1778 non-null float64 167 hincome_12 2752 non-null float64 168 hinc_business_12 2782 non-null float64 169 hinc_rent_12 2782 non-null float64 170 hinc_assets_12 2782 non-null float64 171 hinc_cap_12 2782 non-null float64 172 Pension_12 2782 non-null float64 173 SpousePension_12 1778 non-null float64 174 Religon_Imp_12 2677 non-null category 175 Meet_FnF_12 2675 non-null category 176 SocialActivities_12 2682 non-null category 177 AttendReligiousServices_12 2680 non-null category 178 a16a_12 33 non-null float64 179 YrsLivedInUSA_12 53 non-null float64 180 a22_12 46 non-null category 181 a33b_12 53 non-null category 182 SpeaksEnglish_12 2679 non-null category 183 HousingEnvironment_12 2800 non-null category 184 PredictionYear 2889 non-null int64 dtypes: category(148), float64(34), int64(2), object(1) memory usage: 1.2+ MB
Descriptive statistics for numerical features¶
In [16]:
# Summary statistics for the numeric columns (DataFrame.describe() excludes
# categorical columns by default).
title = "Descriptive Statistics for Numerical Features:"
print("*" * len(title))
print(title)
print("*" * len(title))
print("\n")
display(df.describe())
********************************************** Descriptive Statistics for Numerical Features: **********************************************
| composite_score | Marriages_03 | Num_ADL_03 | Num_IADL_03 | Num_CES-D_Symptoms_03 | Num_Illnesses_03 | Marriages_12 | Num_ADL_12 | Num_IADL_12 | Num_CES-D_Symptoms_12 | ... | hincome_12 | hinc_business_12 | hinc_rent_12 | hinc_assets_12 | hinc_cap_12 | Pension_12 | SpousePension_12 | a16a_12 | YrsLivedInUSA_12 | PredictionYear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2889.000000 | 2861.000000 | 2878.000000 | 2722.000000 | 2715.000000 | 2887.000000 | 2749.000000 | 2763.000000 | 2680.000000 | 2660.000000 | ... | 2.752000e+03 | 2.782000e+03 | 2.782000e+03 | 2782.000000 | 2.782000e+03 | 2782.000000 | 1.778000e+03 | 33.000000 | 53.000000 | 2889.000000 |
| mean | 146.141918 | 1.126879 | 0.070883 | 0.045555 | 3.479190 | 0.971943 | 1.197890 | 0.212450 | 0.143657 | 3.366917 | ... | 8.166788e+04 | 2.999641e+04 | 6.470165e+02 | 833.932423 | 3.147376e+04 | 15920.201294 | 1.513498e+04 | 1973.848485 | 7.490566 | 7.243337 |
| std | 59.078730 | 0.469691 | 0.402262 | 0.285861 | 2.648645 | 0.956563 | 0.611733 | 0.665236 | 0.478618 | 2.624120 | ... | 7.197490e+05 | 6.912140e+05 | 2.812897e+04 | 11193.041142 | 6.918793e+05 | 46219.907592 | 5.433393e+04 | 18.246783 | 11.943171 | 2.387346 |
| min | 8.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | -1.900000e+05 | 0.000000e+00 | -2.100000e+05 | 0.000000 | -2.100000e+05 | 0.000000 | 0.000000e+00 | 1942.000000 | 1.000000 | 4.000000 |
| 25% | 105.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | ... | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 1960.000000 | 1.000000 | 4.000000 |
| 50% | 146.000000 | 1.000000 | 0.000000 | 0.000000 | 3.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 3.000000 | ... | 2.000000e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 1970.000000 | 3.000000 | 9.000000 |
| 75% | 186.000000 | 1.000000 | 0.000000 | 0.000000 | 5.000000 | 2.000000 | 1.000000 | 0.000000 | 0.000000 | 5.000000 | ... | 6.000000e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 20000.000000 | 0.000000e+00 | 1987.000000 | 7.000000 | 9.000000 |
| max | 334.000000 | 5.000000 | 5.000000 | 4.000000 | 9.000000 | 5.000000 | 7.000000 | 5.000000 | 4.000000 | 9.000000 | ... | 3.602000e+07 | 3.600000e+07 | 1.200000e+06 | 360000.000000 | 3.600000e+07 | 960000.000000 | 1.200000e+06 | 2012.000000 | 52.000000 | 9.000000 |
8 rows × 36 columns
Examine Categorical Features (for Year 2003 and 2012)¶
In [17]:
# Partition columns into the 2003 wave (_03), the 2012 wave (_12), and the rest.
cols = categorize_columns_by_suffix(df)
cols_03 = cols['cols_03']
cols_12 = cols['cols_12']
cols_rest = cols['cols_rest']
# Guarded removal: 'UID' may already be absent (e.g. dropped on a previous
# run), and list.remove() raises ValueError on a missing element.
if "UID" in cols_rest:
    cols_rest.remove("UID")
- Categorical features in columns with suffix _03¶
In [18]:
# Categorical (`object`/`category`) columns belonging to the 2003 wave.
cat_cols_03 = list(df[cols_03].select_dtypes(include=['object', 'category']).columns)

#### Plot in Seaborn ####
pd.set_option('future.no_silent_downcasting', True)
plot_categorical_distributions_grid(
    df, cat_cols_03,
    n_cols=3, figsize_per_plot=(10, 5),
    title_fontsize=18, axis_labelsize=16, tick_fontsize=16,
)
#### Plot in Plotly ####
# plot_categorical_distributions_plotly(df, cat_cols_03, width=900, height=450)
--- Plotting Categorical Distributions (Grid Layout) --- Number of valid columns: 58
- Categorical features in columns with suffix _12¶
In [19]:
# Categorical (`object`/`category`) columns belonging to the 2012 wave.
cat_cols_12 = list(df[cols_12].select_dtypes(include=['object', 'category']).columns)

#### Plot in Seaborn ####
pd.set_option('future.no_silent_downcasting', True)
plot_categorical_distributions_grid(
    df, cat_cols_12,
    n_cols=3, figsize_per_plot=(10, 5),
    title_fontsize=18, axis_labelsize=16, tick_fontsize=16,
)
#### Plot in Plotly ####
# plot_categorical_distributions_plotly(df_cat_12, cat_cols_12, width=800, height=400)
--- Plotting Categorical Distributions (Grid Layout) --- Number of valid columns: 86
- Categorical features in rest of the columns¶
In [20]:
# Categorical columns that belong to neither the _03 nor the _12 wave.
df_rest = df[cols_rest]
cat_cols_rest = list(df_rest.select_dtypes(include=['object', 'category']).columns)

#### Plot in Seaborn ####
pd.set_option('future.no_silent_downcasting', True)
plot_categorical_distributions_grid(
    df_rest, cat_cols_rest,
    n_cols=2, figsize_per_plot=(12, 6),
    title_fontsize=14, axis_labelsize=12, tick_fontsize=12,
)
#### Plot in Plotly ####
# plot_categorical_distributions_plotly(df_rest, cat_cols_rest, width=900, height=450)
--- Plotting Categorical Distributions (Grid Layout) --- Number of valid columns: 4
Examine Numerical Features (For Year 2003 & 2012)¶
- Numerical Features in columns with suffix _03¶
In [21]:
# Numeric columns from the 2003 wave: histograms plus violin plots.
num_cols_03 = list(df[cols_03].select_dtypes(include=['number']).columns)
# df[num_cols_03].hist(bins = 30, figsize = (20, 20), color = 'b');

#### Plot Histogram and Violin ####
plot_hist_grid(
    df, num_cols_03,
    bins=30, n_cols=4, figsize_per_plot=(5, 4),
    title_fontsize=14, xlabel_fontsize=11, ylabel_fontsize=11, tick_fontsize=9,
)
plot_violin_grid(
    df,
    numeric_cols=num_cols_03, n_cols=2, figsize_per_plot=(10, 5),
    title_fontsize=18, tick_fontsize=14, show_outliers=True,
)
Plotting histograms...Total plots: 16
Plotting Violin Plots...Number of Plots: 16
- Numerical Features in columns with suffix _12¶
In [22]:
# Numeric columns from the 2012 wave: histograms plus violin plots.
num_cols_12 = list(df[cols_12].select_dtypes(include=['number']).columns)
# df[num_cols_12].hist(bins = 30, figsize = (20, 20), color = 'r');

#### Plot Histogram and Violin ####
plot_hist_grid(
    df, num_cols_12,
    bins=30, n_cols=4, figsize_per_plot=(5, 4),
    title_fontsize=14, xlabel_fontsize=11, ylabel_fontsize=11, tick_fontsize=9,
)
plot_violin_grid(
    df,
    numeric_cols=num_cols_12, n_cols=2, figsize_per_plot=(10, 5),
    title_fontsize=18, tick_fontsize=14, show_outliers=True,
)
Plotting histograms...Total plots: 18
Plotting Violin Plots...Number of Plots: 18
Numerical features in rest of the columns¶
In [23]:
# Numeric columns outside the _03/_12 waves (target + prediction year).
num_cols_rest = df[cols_rest].select_dtypes(include=['number']).columns.to_list()
# Typo fixed in the printed message ("Numberical" -> "Numerical").
print(f"Numerical Features in rest of the columns: {num_cols_rest}\n")
### We treat `PredictionYear` as categorical since it has only two distinct values
# plot_categorical_distributions_plotly(df, ['PredictionYear'], width=600, height=300)
plot_categorical_distributions_seaborn(df, ['PredictionYear'], figsize = (6, 3))
Numberical Features in rest of the columns: ['composite_score', 'PredictionYear'] --- Plotting Categorical Distributions --- Input columns: ['PredictionYear'], Number of Cols: 1
Target variable analysis¶
In [24]:
# Distribution of the target variable `composite_score`
# (histogram + box plot side by side).
# plot_numeric_distribution_plotly(df, 'composite_score')
# plot_box_plotly(df, 'composite_score')
# # --- Plot | Plotly ---
# plot_histogram_and_boxplot(df, 'composite_score')
# --- Plot | SNS
plot_histogram_and_boxplot_sns(df, 'composite_score', figsize = (10, 4))
Examine Missing values¶
In [25]:
def _header(text, width):
    """Print `text` between two asterisk rules of fixed `width`."""
    print("*" * width)
    print(text)
    print("*" * width)

# Column name used for the per-feature missing-value percentage table.
PCT_COL = "%age of Missing Values"

_header("Missing Values:", 23)
# # --- Plot | Plotly ---
# plot_missing_value_distribution(df)
# --- Plot | SNS (only the top-N columns by missing count are displayed) ---
plot_missing_value_distribution_sns(df, top_n=30, figsize=(14, 8))

total_missing = df.isnull().sum().sum()
print(f"Total Missing Values: {total_missing}\n")

missing_percentage = pd.DataFrame(df.isnull().sum() * 100 / len(df), columns=[PCT_COL])

_header("Percentage of Missing Values per Feature:", 41)
display(missing_percentage)

_header("Features with More than 40% Missing Values:", 43)
display(missing_percentage[missing_percentage[PCT_COL] > 40])

_header("Features with less than 40% Missing Values:", 43)
# Bare final expression: rendered as the cell's rich output.
missing_percentage[(missing_percentage[PCT_COL] > 0) & (missing_percentage[PCT_COL] < 40)]
*********************** Missing Values: ***********************
Total Missing Values: 60409 ***************************************** Percentage of Missing Values per Feature: *****************************************
| %age of Missing Values | |
|---|---|
| UID | 0.000000 |
| Year | 0.000000 |
| composite_score | 0.000000 |
| Age_03 | 0.069228 |
| Urban_03 | 0.000000 |
| ... | ... |
| a22_12 | 98.407754 |
| a33b_12 | 98.165455 |
| SpeaksEnglish_12 | 7.268951 |
| HousingEnvironment_12 | 3.080651 |
| PredictionYear | 0.000000 |
185 rows × 1 columns
******************************************* Features with More than 40% Missing Values: *******************************************
| %age of Missing Values | |
|---|---|
| EducationMother | 59.432330 |
| EducationFather | 53.478712 |
| JobHrsWeekly_03 | 49.775009 |
| JobCatLongest_03 | 86.154379 |
| YrJobEnded_03 | 84.215992 |
| ReasonJobEnded_03 | 83.696781 |
| JobHrsWeekly_12 | 68.535826 |
| JobCatLongest_12 | 59.813084 |
| YrJobEnded_12 | 90.031153 |
| ReasonJobEnded_12 | 89.685012 |
| a16a_12 | 98.857736 |
| YrsLivedInUSA_12 | 98.165455 |
| a22_12 | 98.407754 |
| a33b_12 | 98.165455 |
******************************************* Features with less than 40% Missing Values: *******************************************
Out[25]:
| %age of Missing Values | |
|---|---|
| Age_03 | 0.069228 |
| Marriages_03 | 0.969193 |
| Education_03 | 0.484597 |
| Num_Living_Child_03 | 0.588439 |
| Migration_03 | 0.069228 |
| ... | ... |
| Meet_FnF_12 | 7.407407 |
| SocialActivities_12 | 7.165109 |
| AttendReligiousServices_12 | 7.234337 |
| SpeaksEnglish_12 | 7.268951 |
| HousingEnvironment_12 | 3.080651 |
149 rows × 1 columns
Data Preparation¶
- Dropping Features¶
- Drop Columns
In [26]:
# # These are redundant features
# COLS_TO_DROP = ['UID', 'imss_03', 'imss_12', 'issste_03', 'issste_12', 'pem_def_mar_03', 'pem_def_mar_12',
# 'insur_private_03', 'insur_private_12', 'insur_other_03', 'insur_other_12', 'seg_pop_12',
# 'Tired_03', 'Tired_12', 'Happy_03', 'Happy_12']
# print(f"Redundant Features: {COLS_TO_DROP}")
# print(f"Number of Redundant Features: {len(COLS_TO_DROP)}\n")
# # We will Drop Features which have more than 70% missing values
# na_cols_to_drop = missing_percentage[missing_percentage["%age of Missing Values"] >= 70]
# na_cols_to_drop = na_cols_to_drop.index.to_list()
# print(f"Features with more than 70% missing values: {na_cols_to_drop}")
# print(f"Number of features with more than 70% missing values: {len(na_cols_to_drop)}\n")
# # Adding two columns
# COLS_TO_DROP.extend(na_cols_to_drop)
# print(f"Features to Drop: {COLS_TO_DROP}\n")
# print(f"Number of Features to Drop: {len(COLS_TO_DROP)}\n")
# # Now Dropping Features
# columns_actually_dropped = []
# for col in COLS_TO_DROP:
# if col in df.columns:
# df = df.drop(col, axis=1)
# columns_actually_dropped.append(col)
# else:
# print(f"Warning: Column '{col}' not found in DataFrame. Skipping.")
# # Checking
# check_lists = lambda COLS_TO_DROP, columns_actually_dropped: sorted(COLS_TO_DROP) == sorted(columns_actually_dropped)
# print("*" * 31)
# print("Features Dropped | Successfully" if check_lists(COLS_TO_DROP, columns_actually_dropped) else "Features Dropped | UnSuccessfull")
# print("*" * 31)
In [27]:
# Redundant features to remove before modelling (duplicated / superseded columns).
COLS_TO_DROP = [
    'UID',
    'imss_03', 'imss_12',
    'issste_03', 'issste_12',
    'pem_def_mar_03', 'pem_def_mar_12',
    'insur_private_03', 'insur_private_12',
    'insur_other_03', 'insur_other_12',
    'seg_pop_12',
    'Tired_03', 'Tired_12',
    'Happy_03', 'Happy_12',
]

# Single-step pipeline wrapping the project's ColumnDropper transformer.
dropColumns = Pipeline(steps=[('drop_columns', ColumnDropper(columns_to_drop=COLS_TO_DROP))])

wide_rule = "*" * 80
print(wide_rule)
print(f"PipeLine | Drop Columns: {dropColumns}")
print(wide_rule)

# --- Applying Pipeline | Drop Columns ---
narrow_rule = "*" * 40
print(narrow_rule)
print("--- Applying Pipeline | Drop Columns ---")
print(narrow_rule)
df = dropColumns.fit_transform(df)
********************************************************************************
PipeLine | Drop Columns: Pipeline(steps=[('drop_columns',
ColumnDropper(columns_to_drop=['UID', 'imss_03', 'imss_12',
'issste_03', 'issste_12',
'pem_def_mar_03',
'pem_def_mar_12',
'insur_private_03',
'insur_private_12',
'insur_other_03',
'insur_other_12', 'seg_pop_12',
'Tired_03', 'Tired_12',
'Happy_03', 'Happy_12']))])
********************************************************************************
****************************************
--- Applying Pipeline | Drop Columns ---
****************************************
ColumnDropper: Dropped columns: ['UID', 'imss_03', 'imss_12', 'issste_03', 'issste_12', 'pem_def_mar_03', 'pem_def_mar_12', 'insur_private_03', 'insur_private_12', 'insur_other_03', 'insur_other_12', 'seg_pop_12', 'Tired_03', 'Tired_12', 'Happy_03', 'Happy_12']
- Drop Columns with High Missing Values
In [28]:
# Drop every feature whose share of missing values is at or above 70%.
dropColumnsHighNA = Pipeline(
    steps=[('drop_columns_high_na', DropColumnsHighNA(threshold=70.0))]
)

wide_rule = "*" * 80
print(wide_rule)
print(f"PipeLine | Drop Columns With High Missing Values: {dropColumnsHighNA}")
print(wide_rule)

# --- Applying Pipeline | Drop Columns With High Missing Values ---
narrow_rule = "*" * 65
print(narrow_rule)
print("--- Applying Pipeline | Drop Columns With High Missing Values ---")
print(narrow_rule)
df = dropColumnsHighNA.fit_transform(df)
********************************************************************************
PipeLine | Drop Columns With High Missing Values: Pipeline(steps=[('drop_columns_high_na', DropColumnsHighNA())])
********************************************************************************
*****************************************************************
--- Applying Pipeline | Drop Columns With High Missing Values ---
*****************************************************************
DropColumnsHighNA: Dropped columns: ['JobCatLongest_03', 'YrJobEnded_03', 'ReasonJobEnded_03', 'YrJobEnded_12', 'ReasonJobEnded_12', 'a16a_12', 'YrsLivedInUSA_12', 'a22_12', 'a33b_12']
- Impute Missing Values
In [29]:
# Function-based imputation alternatives — superseded by the MissingValueImputer
# pipeline in the next cell; kept commented for reference.
# Impute using median for numeric columns, and mode for categorical columns:
# df_train_cleaned = impute_missing_values(df, num_strategy="median", cat_strategy="mode")
# Or use mean for numerics, and fill categoricals with a custom label, e.g. `missing`:
# df_clean = impute_missing_values(df, num_strategy="mean", cat_strategy="missing")
In [30]:
# Impute remaining missing values: median for numeric columns, mode for categoricals.
missingValueImputer = Pipeline(
    steps=[('missing_value_imputer',
            MissingValueImputer(num_strategy="median", cat_strategy="mode"))]
)

wide_rule = "*" * 100
print(wide_rule)
print(f"PipeLine | Impute Missing Values: {missingValueImputer}")
print(wide_rule)

# --- Applying Pipeline | Impute Missing Values ---
narrow_rule = "*" * 50
print(narrow_rule)
print("--- Applying Pipeline | Impute Missing Values ---")
print(narrow_rule)
df = missingValueImputer.fit_transform(df)
****************************************************************************************************
PipeLine | Impute Missing Values: Pipeline(steps=[('missing_value_imputer', MissingValueImputer())])
****************************************************************************************************
**************************************************
--- Applying Pipeline | Impute Missing Values ---
**************************************************
MissingValueImputer: Fitted. Numerical imputers: {'composite_score': None, 'Marriages_03': 1.0, 'Num_ADL_03': 0.0, 'Num_IADL_03': 0.0, 'Num_CES-D_Symptoms_03': 3.0, 'Num_Illnesses_03': 1.0, 'Marriages_12': 1.0, 'Num_ADL_12': 0.0, 'Num_IADL_12': 0.0, 'Num_CES-D_Symptoms_12': 3.0, 'Num_Illnesses_12': 1.0, 'JobHrsWeekly_03': 45.0, 'Earnings_03': 0.0, 'SpouseEarnings_03': 0.0, 'hincome_03': 30000.0, 'hinc_business_03': 0.0, 'hinc_rent_03': 0.0, 'hinc_assets_03': 0.0, 'hinc_cap_03': 0.0, 'Pension_03': 0.0, 'SpousePension_03': 0.0, 'JobHrsWeekly_12': 36.0, 'Earnings_12': 0.0, 'SpouseEarnings_12': 0.0, 'hincome_12': 20000.0, 'hinc_business_12': 0.0, 'hinc_rent_12': 0.0, 'hinc_assets_12': 0.0, 'hinc_cap_12': 0.0, 'Pension_12': 0.0, 'SpousePension_12': 0.0, 'PredictionYear': None}, Categorical imputers: {'Year': None, 'Age_03': '50-59', 'Urban_03': None, 'Married_03': None, 'Education_03': '1-5 Years', 'Num_Living_Child_03': '3 or 4', 'Migration_03': 0.0, 'GlobalHealth_03': 'Fair', 'ADL_Dress_03': 0.0, 'ADL_Walk_03': 0.0, 'ADL_Bath_03': 0.0, 'ADL_Eat_03': 0.0, 'ADL_Bed_03': 0.0, 'ADL_Toilet_03': 0.0, 'IADL_Money_03': 0.0, 'IADL_Meds_03': 0.0, 'IADL_Shop_03': 0.0, 'IADL_Meals_03': 0.0, 'Depressed_03': 0.0, 'Hard_03': 0.0, 'Restless_03': 0.0, 'Lonely_03': 0.0, 'Enjoy_03': 1.0, 'Sad_03': 0.0, 'Energetic_03': 0.0, 'CES-D_Symptoms_03': 0.0, 'Hypertension_03': None, 'Diabetes_03': None, 'Respiratory_Illness_03': None, 'Arthritis_03': None, 'HeartAttack_03': 0.0, 'Stroke_03': None, 'Cancer_03': None, 'BMI_03': 'Over Weight', 'Exercise_03': 0.0, 'Alcohol_03': 0.0, 'Tobacco_03': 0.0, 'Test_Cholestrol_03': 1.0, 'Test_Tuber_03': 0.0, 'Test_Diabetes_03': 1.0, 'Test_BloodPress_03': 1.0, 'Hospitalized_03': None, 'Visit_Dr_03': 1.0, 'OutPatient_03': None, 'Visit_Dental_03': None, 'Insured_03': None, 'FamilyDecisions_03': 'Both', 'Employment_03': 'Currently Working', 'Age_12': '60-69', 'Urban_12': 'Urban', 'Married_12': 'Married or In Civil Union', 'Education_12': '1-5 Years', 
'Num_Living_Child_12': '3 or 4', 'Migration_12': 0.0, 'GlobalHealth_12': 'Fair', 'ADL_Dress_12': 0.0, 'ADL_Walk_12': 0.0, 'ADL_Bath_12': 0.0, 'ADL_Eat_12': 0.0, 'ADL_Bed_12': 0.0, 'ADL_Toilet_12': 0.0, 'IADL_Money_12': 0.0, 'IADL_Meds_12': 0.0, 'IADL_Shop_12': 0.0, 'IADL_Meals_12': 0.0, 'Depressed_12': 0.0, 'Hard_12': 0.0, 'Restless_12': 0.0, 'Lonely_12': 0.0, 'Enjoy_12': 1.0, 'Sad_12': 0.0, 'Energetic_12': 0.0, 'CES-D_Symptoms_12': 0.0, 'Hypertension_12': 0.0, 'Diabetes_12': 0.0, 'Respiratory_Illness_12': 0.0, 'Arthritis_12': 0.0, 'HeartAttack_12': 0.0, 'Stroke_12': 0.0, 'Cancer_12': 0.0, 'BMI_12': 'Over Weight', 'Exercise_12': 0.0, 'Alcohol_12': 0.0, 'Tobacco_12': 0.0, 'Test_Cholestrol_12': 1.0, 'Test_Tuber_12': 0.0, 'Test_Diabetes_12': 1.0, 'Test_BloodPress_12': 1.0, 'Hospitalized_12': 0.0, 'Visit_Dr_12': 1.0, 'OutPatient_12': 0.0, 'Visit_Dental_12': 0.0, 'Insured_12': 1.0, 'FamilyDecisions_12': 'Both', 'Employment_12': 'Dedicated to Household Chores', 'Vax_Flu_12': 1.0, 'Vax_Pneu_12': 0.0, 'CareAdult_12': 0.0, 'CareChild_12': 0.0, 'Volunteer_12': 0.0, 'AttendsClass_12': 0.0, 'AttendsClub_12': 0.0, 'Reads_12': 1.0, 'Games_12': 0.0, 'TableGames_12': 0.0, 'UseElectronicDevices_12': 1.0, 'HouseMaintenance_12': 1.0, 'TV_12': 1.0, 'Sewing_12': 0.0, 'Satement_Ideal_12': 'Agrees', 'Satement_Excel_12': 'Agrees', 'Satement_Fine_12': 'Agrees', 'COSAS_IMP_12': 'Agrees', 'WouldntChange_12': 'Agrees', 'Memory_12': 'Fair', 'Gender': None, 'EducationMother': 'Some Primary', 'EducationFather': 'Some Primary', 'SpouseGender_03': 'Man', 'Religon_Imp_03': 'Very Important', 'SpouseGender_12': 'Man', 'JobCatLongest_12': 'Artisans and Workers in Production, Repair, Maintenance', 'Religon_Imp_12': 'Very Important', 'Meet_FnF_12': 'Never', 'SocialActivities_12': 'Never', 'AttendReligiousServices_12': 0.0, 'SpeaksEnglish_12': 0.0, 'HousingEnvironment_12': 'Wood, Mosaic, or other Covering'}
MissingValueImputer: Number of missing values after imputation: 0
Save to .CSV¶
In [31]:
# Sanity check of the output path — uncomment to inspect before saving.
# print(f"Path to Save Cleaned File: {path_train_cleaned}")
In [32]:
# Persist the cleaned training set — disabled; uncomment to (re)write train_v01.csv.
# df.to_csv(path_train_cleaned, index=False, encoding="utf8")
# print("Done")